CASE_3

Liam Phan, Michael Bigler, Tania Loureiro, William Elkiess, Dakota Cuellar and Ilyana El Mendili

2023-05-04

Packages

library(DT)
library(adabag)
FALSE Loading required package: rpart
FALSE Loading required package: caret
FALSE Loading required package: ggplot2
FALSE Loading required package: lattice
FALSE Loading required package: foreach
FALSE Loading required package: doParallel
FALSE Loading required package: iterators
FALSE Loading required package: parallel
library(rpart.plot)
library(pROC)
FALSE Type 'citation("pROC")' for a citation.
FALSE 
FALSE Attaching package: 'pROC'
FALSE The following objects are masked from 'package:stats':
FALSE 
FALSE     cov, smooth, var
library(summarytools)
library(corrplot)
FALSE corrplot 0.92 loaded
library(dplyr)
FALSE 
FALSE Attaching package: 'dplyr'
FALSE The following objects are masked from 'package:stats':
FALSE 
FALSE     filter, lag
FALSE The following objects are masked from 'package:base':
FALSE 
FALSE     intersect, setdiff, setequal, union
library(GGally)
FALSE Registered S3 method overwritten by 'GGally':
FALSE   method from   
FALSE   +.gg   ggplot2
library(fastDummies)
library(ggcorrplot)
library(klaR)
FALSE Loading required package: MASS
FALSE 
FALSE Attaching package: 'MASS'
FALSE The following object is masked from 'package:dplyr':
FALSE 
FALSE     select
library(psych)
FALSE 
FALSE Attaching package: 'psych'
FALSE The following objects are masked from 'package:ggplot2':
FALSE 
FALSE     %+%, alpha
library(MASS)
# library(ggord)
library(devtools)
FALSE Loading required package: usethis
library(ggplot2)
library(ggthemes)
library(GGally)
library(caret)
library(splitTools)
library(rpart)
library(xgboost)
FALSE 
FALSE Attaching package: 'xgboost'
FALSE The following object is masked from 'package:dplyr':
FALSE 
FALSE     slice
library(caTools)
library(dplyr)
library(caret)
library(naniar)

#' Draw a 2x2 confusion matrix plus headline metrics with base graphics.
#'
#' The upper panel shows the four cell counts of `cm$table` (rows = predicted,
#' columns = actual, labelled 'No'/'Yes'); the lower panel prints entries
#' 1, 2, 5, 6 and 7 of `cm$byClass` and entries 1 and 2 of `cm$overall`.
#'
#' @param cm A list-like object with elements `table` (2x2), `byClass` (named,
#'   at least 7 entries) and `overall` (named, at least 2 entries), e.g. the
#'   result of `caret::confusionMatrix()` for a two-class problem.
#' @return `NULL`, invisibly; called for its plotting side effect.
CM_Function <- function(cm) {

  # Remember the caller's panel arrangement and margins and put them back on
  # exit; otherwise the 3-row layout set below leaks into every later plot.
  old_par <- par(c("mfrow", "mar"))
  on.exit(par(old_par), add = TRUE)

  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('CONFUSION MATRIX', cex.main=2)

  # create the matrix 
  rect(150, 430, 240, 370, col='#2F4F4E')
  text(195, 435, 'No', cex=1.2)
  rect(250, 430, 340, 370, col='#0D8387')
  text(295, 435, 'Yes', cex=1.2)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)
  rect(150, 305, 240, 365, col='#0D8387')
  rect(250, 305, 340, 365, col='#2F4F4E')
  text(140, 400, 'No', cex=1.2, srt=90)
  text(140, 335, 'Yes', cex=1.2, srt=90)

  # add in the cm results 
  # as.numeric() flattens the table column by column, so the counts fill the
  # left column top-to-bottom first, then the right column.
  res <- as.numeric(cm$table)
  text(c(195, 195, 295, 295), c(400, 335, 400, 335), res[1:4],
       cex=1.6, font=2, col='white')

  # add in the specifics 
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  shown <- c(1, 2, 5, 6, 7)          # positions of the byClass entries to print
  x_pos <- c(10, 30, 50, 70, 90)
  text(x_pos, 85, names(cm$byClass[shown]), cex=1.2, font=2)
  text(x_pos, 70, round(as.numeric(cm$byClass[shown]), 3), cex=1.2)

  # add in the accuracy information 
  text(c(30, 70), 35, names(cm$overall[1:2]), cex=1.5, font=2)
  text(c(30, 70), 20, round(as.numeric(cm$overall[1:2]), 3), cex=1.4)

  invisible(NULL)
}  

#options(repos = c(
    #fawda123 = 'https://fawda123.r-universe.dev',
    #CRAN = 'https://cloud.r-project.org'))

# Install ggord
#install.packages('ggord')

Data and Feature Engineering

# Read the churn workbook and recode the three yes/no columns as factors
# whose levels are labelled '0' (no) and '1' (yes).
df <- readxl::read_xls('Cchurn.xls')
yes_no_cols <- c('international_plan', 'voice_mail_plan', 'churn')
df[yes_no_cols] <- lapply(
  df[yes_no_cols],
  factor,
  levels = c('no', 'yes'),
  labels = c('0', '1')
)

Summary

print(summarytools::dfSummary(df), method = 'render')

Data Frame Summary

df

Dimensions: 5000 x 18
Duplicates: 0
No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 account_length [numeric]
Mean (sd) : 100.3 (39.7)
min ≤ med ≤ max:
1 ≤ 100 ≤ 243
IQR (CV) : 54 (0.4)
218 distinct values 5000 (100.0%) 0 (0.0%)
2 international_plan [factor]
1. 0
2. 1
4527(90.5%)
473(9.5%)
5000 (100.0%) 0 (0.0%)
3 voice_mail_plan [factor]
1. 0
2. 1
3677(73.5%)
1323(26.5%)
5000 (100.0%) 0 (0.0%)
4 number_vmail_messages [numeric]
Mean (sd) : 7.8 (13.5)
min ≤ med ≤ max:
0 ≤ 0 ≤ 52
IQR (CV) : 17 (1.7)
48 distinct values 5000 (100.0%) 0 (0.0%)
5 total_day_minutes [numeric]
Mean (sd) : 180.3 (53.9)
min ≤ med ≤ max:
0 ≤ 180.1 ≤ 351.5
IQR (CV) : 72.5 (0.3)
1961 distinct values 5000 (100.0%) 0 (0.0%)
6 total_day_calls [numeric]
Mean (sd) : 100 (19.8)
min ≤ med ≤ max:
0 ≤ 100 ≤ 165
IQR (CV) : 26 (0.2)
123 distinct values 5000 (100.0%) 0 (0.0%)
7 total_day_charge [numeric]
Mean (sd) : 30.6 (9.2)
min ≤ med ≤ max:
0 ≤ 30.6 ≤ 59.8
IQR (CV) : 12.3 (0.3)
1961 distinct values 5000 (100.0%) 0 (0.0%)
8 total_eve_minutes [numeric]
Mean (sd) : 200.6 (50.6)
min ≤ med ≤ max:
0 ≤ 201 ≤ 363.7
IQR (CV) : 67.7 (0.3)
1879 distinct values 5000 (100.0%) 0 (0.0%)
9 total_eve_calls [numeric]
Mean (sd) : 100.2 (19.8)
min ≤ med ≤ max:
0 ≤ 100 ≤ 170
IQR (CV) : 27 (0.2)
126 distinct values 5000 (100.0%) 0 (0.0%)
10 total_eve_charge [numeric]
Mean (sd) : 17.1 (4.3)
min ≤ med ≤ max:
0 ≤ 17.1 ≤ 30.9
IQR (CV) : 5.8 (0.3)
1659 distinct values 5000 (100.0%) 0 (0.0%)
11 total_night_minutes [numeric]
Mean (sd) : 200.4 (50.5)
min ≤ med ≤ max:
0 ≤ 200.4 ≤ 395
IQR (CV) : 67.8 (0.3)
1853 distinct values 5000 (100.0%) 0 (0.0%)
12 total_night_calls [numeric]
Mean (sd) : 99.9 (20)
min ≤ med ≤ max:
0 ≤ 100 ≤ 175
IQR (CV) : 26 (0.2)
131 distinct values 5000 (100.0%) 0 (0.0%)
13 total_night_charge [numeric]
Mean (sd) : 9 (2.3)
min ≤ med ≤ max:
0 ≤ 9 ≤ 17.8
IQR (CV) : 3.1 (0.3)
1028 distinct values 5000 (100.0%) 0 (0.0%)
14 total_intl_minutes [numeric]
Mean (sd) : 10.3 (2.8)
min ≤ med ≤ max:
0 ≤ 10.3 ≤ 20
IQR (CV) : 3.5 (0.3)
170 distinct values 5000 (100.0%) 0 (0.0%)
15 total_intl_calls [numeric]
Mean (sd) : 4.4 (2.5)
min ≤ med ≤ max:
0 ≤ 4 ≤ 20
IQR (CV) : 3 (0.6)
21 distinct values 5000 (100.0%) 0 (0.0%)
16 total_intl_charge [numeric]
Mean (sd) : 2.8 (0.7)
min ≤ med ≤ max:
0 ≤ 2.8 ≤ 5.4
IQR (CV) : 0.9 (0.3)
170 distinct values 5000 (100.0%) 0 (0.0%)
17 number_customer_service_calls [numeric]
Mean (sd) : 1.6 (1.3)
min ≤ med ≤ max:
0 ≤ 1 ≤ 9
IQR (CV) : 1 (0.8)
0:1023(20.5%)
1:1786(35.7%)
2:1127(22.5%)
3:665(13.3%)
4:252(5.0%)
5:96(1.9%)
6:34(0.7%)
7:13(0.3%)
8:2(0.0%)
9:2(0.0%)
5000 (100.0%) 0 (0.0%)
18 churn [factor]
1. 0
2. 1
4293(85.9%)
707(14.1%)
5000 (100.0%) 0 (0.0%)

Generated by summarytools 1.0.1 (R version 4.2.3)
2023-05-04

  • We have no missing values -> perfect
  • Heavily imbalanced dependent variable (86 % no / 14 % yes) -> maybe resample to balance the classes / maybe not, because we would lose information from the other data
  • Independent variables are on different scales -> standardize
  • two (maybe three) categorical predictors: International plan / voice_mail_plan (/ maybe number_customer_service_calls) -> dummy encode -> not necessary as already 0 and 1
  • Rest of the data is numeric and most of the variables look normally distributed, with the exception of number_vmail_messages and total_intl_calls
    • transform these value to make them normal?
    • maybe make parts of them categorical? (receiving voice mail or not, calling internationally or not)
    • or maybe the categorical values that we have already give an indication for this
    • Test normality of variables
  • Can variables be combined? We have day / eve / night / intl calls and for each of them minutes / calls / charge. Maybe we can combine this into one metric. Maybe average cost per minute or average cost per call?

Correlation Plot

# Keep the numeric columns only
df_numeric <- df[vapply(df, is.numeric, logical(1))]

# Pairwise correlations and their p-values
M <- cor(df_numeric)
p.mat <- cor_pmat(df_numeric)

# Lower-triangle heat map, clustered, with coefficients printed and
# correlations that are not significant at the 5 % level marked
ggcorrplot(
  M,
  hc.order = TRUE,
  type = "lower",
  lab = TRUE,
  p.mat = p.mat,
  sig.level = 0.05,
  lab_size = 2,
  tl.cex = 10,
  outline.col = "white",
  ggtheme = ggplot2::theme_minimal(),
  colors = c("#823038", "white", "#2596be")
)

Proves theory from before -> we can make one metric out of charge and minutes –> charge / minutes

Data Engineering

# For each period (day / eve / night / intl) replace the charge and minutes
# columns by a single charge-per-minute rate; the rate is set to 0 where no
# minutes were used, to avoid dividing by zero.
periods <- c("day", "eve", "night", "intl")
minute_cols <- paste0("total_", periods, "_minutes")
charge_cols <- paste0("total_", periods, "_charge")
rate_cols <- paste0(charge_cols, "_per_minute")

df[rate_cols] <- unname(Map(
  function(charge, minutes) ifelse(minutes == 0, 0, charge / minutes),
  df[charge_cols],
  df[minute_cols]
))

# Drop the raw charge / minutes columns now that the rates exist
df <- df[, !(names(df) %in% c(charge_cols, minute_cols))]

Correlation Plot

# Numeric columns of the engineered data set
df_numeric <- df[vapply(df, is.numeric, logical(1))]

# Correlation matrix with matching p-value matrix
M <- cor(df_numeric)
p.mat <- cor_pmat(df_numeric)

# Same clustered lower-triangle heat map as before, now on the new features
ggcorrplot(
  M,
  hc.order = TRUE,
  type = "lower",
  lab = TRUE,
  p.mat = p.mat,
  sig.level = 0.05,
  lab_size = 2,
  tl.cex = 10,
  outline.col = "white",
  ggtheme = ggplot2::theme_minimal(),
  colors = c("#823038", "white", "#2596be")
)

Now we have non-correlated data

Relationship between variables

# theme_set(theme_minimal())
# 
# ggpairs(
#   data = df,
#   columns = c(1:9,11:14),
#   mapping = aes(col = churn, alpha = .9)
# ) +
#   scale_fill_colorblind() +
#   scale_color_colorblind()

We see that the classes are hard to separate linearly. Therefore one can either introduce new features of higher order or use methods which do not need the data to be linearly separable.

Adding features of higher order

Only squaring as we have no negative data. Cubing would be needed with negative data.

# squared
# Square only the numeric predictors, selected by type rather than by column
# position. Squaring the whole data frame would also try to exponentiate the
# factor columns (a warning and an all-NA column each) and then rely on
# hard-coded indices to throw those columns away again.
numeric_cols <- names(df)[vapply(df, is.numeric, logical(1))]
df2 <- df[numeric_cols]^2
colnames(df2) <- paste0(colnames(df2), '_sqd')

# Append the squared copies to the original columns
df <- cbind(df, df2)

Relationship between data in higher order

# theme_set(theme_minimal())
# 
# ggpairs(
#   data = df,
#   columns = c(1:9, 11:25),
#   mapping = aes(col = churn, alpha = .9)
# ) +
#   scale_fill_colorblind() +
#   scale_color_colorblind()

Sampling Methods

As we have unbalanced data we need to use a sampling method to balance the classes. Hereby there are four different methods. OVER / UNDER / BOTH / ROSE.

library(ROSE)
FALSE Loaded ROSE 0.0-4
# OVER
# Fixed seed so the resampled data set is reproducible between runs
df_OVER <- ovun.sample(churn ~ ., data = df, method = "over", seed = 1)$data

table(df$churn)
FALSE 
FALSE    0    1 
FALSE 4293  707
table(df_OVER$churn)
FALSE 
FALSE    0    1 
FALSE 4293 4338
# UNDER
# Fixed seed so the resampled data set is reproducible between runs
df_UNDER <- ovun.sample(churn ~ ., data = df, method = "under", seed = 1)$data

table(df$churn)
FALSE 
FALSE    0    1 
FALSE 4293  707
table(df_UNDER$churn)
FALSE 
FALSE   0   1 
FALSE 701 707
# BOTH
# Fixed seed so the resampled data set is reproducible between runs
df_BOTH <- ovun.sample(churn ~ ., data = df, method = "both", seed = 1)$data

table(df$churn)
FALSE 
FALSE    0    1 
FALSE 4293  707
table(df_BOTH$churn)
FALSE 
FALSE    0    1 
FALSE 2503 2497
# ROSE
# Synthetic balanced sample: target minority share p = 0.5, fixed seed
df_ROSE <- ROSE(
  formula = churn ~ .,
  data = df,
  p = 0.5,
  seed = 1
)$data

Sampling Visualization

# theme_set(theme_minimal())
# 
# ggpairs(
#   data = df_ROSE,
#   columns = c(1:9, 11:25),
#   mapping = aes(col = churn, alpha = .9)
# ) +
#   scale_fill_colorblind() +
#   scale_color_colorblind() +
#   labs(title = "Machine Learning Project")
# 
# ggpairs(
#   data = df_OVER,
#   columns = c(1:9, 11:25),
#   mapping = aes(col = churn, alpha = .9)
# ) +
#   scale_fill_colorblind() +
#   scale_color_colorblind() +
#   labs(title = "Machine Learning Project")

# ggpairs(
#   data = df_UNDER,
#   columns = c(1:9, 11:25),
#   mapping = aes(col = churn, alpha = .9)
# ) +
#   scale_fill_colorblind() +
#   scale_color_colorblind() +
#   labs(title = "Machine Learning Project")

# ggpairs(
#   data = df_BOTH,
#   columns = c(1:9, 11:25),
#   mapping = aes(col = churn, alpha = .9)
# ) +
#   scale_fill_colorblind() +
#   scale_color_colorblind() +
#   labs(title = "Machine Learning Project")

Train / Test split

As we need to test the models we need to split the sampled data.

set.seed(1)
# Split the ORIGINAL (unsampled) data first. Partitioning an already
# over-sampled data set puts duplicated copies of the same minority rows into
# both train and test, which leaks information and inflates every test metric.
data <- df
inds <- splitTools::partition(data$churn, p = c(train = 0.7, test = 0.3))
# Balance the classes in the training part only (random over-sampling of the
# minority class). Use method = "under" / "both", or ROSE(), here to try the
# other sampling strategies.
dftrain <- ovun.sample(churn ~ ., data = data[inds$train, ],
                       method = "over", seed = 1)$data
# The test part keeps its natural class distribution and is never resampled.
dftest <- data[inds$test, ]

Standardizing

As some methods need scaled data we scale the data here to be centered.

# Estimate centring / scaling parameters on the training set only, then apply
# that same transformation to both the training and the test set.
scaling_steps <- c("center", "scale")
norm.value <- preProcess(dftrain, method = scaling_steps)
dftrain <- predict(norm.value, newdata = dftrain)
dftest <- predict(norm.value, newdata = dftest)

Predicting Models

neural net

# dftrain <- dftrain |> 
#   mutate_if(is.factor, as.character) |> 
#   mutate_if(is.character, as.numeric)
# 
# library(neuralnet)
# mod.neural <- neuralnet(churn ~ ., data = dftrain, hidden=c(15,15), linear.output = FALSE)
# 
# predicted.neural <- predict(mod.neural, dftest[,-c(10)])
# 
# confmat.neural <- confusionMatrix(data=predicted.neural, reference = dftest$churn, positive = '1')
# 
# CM_Function(confmat.neural)
# 
# roc_score.neural =roc(factor(dftest$churn, ordered=TRUE), factor(predicted.neural, ordered=TRUE))
# plot(roc_score.neural ,main ="ROC curve")

Boosting

set.seed(123)

# train boosted model (adabag::boosting)
mod.boost <- boosting(churn ~ ., data = dftrain)

# Predict once and reuse the result: $class holds the predicted labels,
# $prob the per-class support used for the ROC curve below.
pred.boost <- predict(mod.boost, newdata = dftest)
# Fix the levels so the factor keeps both classes even if only one is predicted
predicted.boost <- factor(pred.boost$class, levels = levels(dftest$churn))

confmat.boost <- confusionMatrix(data = predicted.boost, reference = dftest$churn, positive = '1')

CM_Function(confmat.boost)

# Build the ROC curve from the continuous score for class '1', not from the
# hard 0/1 labels: labels give a single threshold and therefore a one-point
# "curve". NOTE(review): column 2 of $prob is assumed to be the second level
# of churn ('1') - confirm against the adabag documentation.
roc_score.boost <- roc(response = dftest$churn, predictor = pred.boost$prob[, 2])
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
plot(roc_score.boost ,main ="ROC curve")

Ctree

# Grow a deliberately large tree (tiny cp, leaves of a single observation) so
# that pruning has the whole complexity path to choose from.
tree_full <- rpart(churn ~ ., 
              data = dftrain, 
              method = "class",  # "class" because Y is a binary factor
              minbucket = 1,
              cp = 0.00001) 

# Plot tree
rpart.plot(tree_full, yesno = TRUE, digits =-6)

min_xerr <- which.min(tree_full$cptable[,"xerror"]) # select minimum cross-validation error
cp_bp <- tree_full$cptable[min_xerr,"CP"]  # find the corresponding CP value, to get the "best pruned " tree


mod.pruned_tree <- prune(tree_full, cp = cp_bp) # re-compute the tree with the selected Cp
rpart.plot(mod.pruned_tree, yesno = TRUE, digits =-3)

# Column 10 is churn; drop it so only predictors are passed to predict()
predicted.pruned_tree <- predict(mod.pruned_tree, dftest[, -10], type = "class")
# Class probabilities (second column = second level of churn) for the ROC curve
prob.pruned_tree <- predict(mod.pruned_tree, dftest[, -10], type = "prob")[, 2]

confmat.prunned_tree <- confusionMatrix(data = predicted.pruned_tree, reference = dftest$churn, positive = '1')

CM_Function(confmat.prunned_tree)

# ROC from the predicted probabilities rather than from the hard labels, which
# would collapse the curve to a single operating point.
roc_score.prunned_tree <- roc(response = dftest$churn, predictor = prob.pruned_tree)
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
plot(roc_score.prunned_tree ,main ="ROC curve")

Bagging

set.seed(123)
library(ipred)
FALSE 
FALSE Attaching package: 'ipred'
FALSE The following object is masked from 'package:adabag':
FALSE 
FALSE     bagging
# train bagged model
# Bagged classification trees: 100 bootstrap replicates of unpruned trees
# (minsplit = 2, cp = 0) with the out-of-bag error estimate switched on.
# The ipred implementation is named explicitly because adabag exports a
# function with the same name.
ames_bag1 <- ipred::bagging(
  formula = churn ~ .,
  data = dftrain,
  nbagg = 100,
  coob = TRUE,
  control = rpart.control(minsplit = 2, cp = 0)
)

print(ames_bag1)
FALSE 
FALSE Bagging classification trees with 100 bootstrap replications 
FALSE 
FALSE Call: bagging.data.frame(formula = churn ~ ., data = dftrain, nbagg = 100, 
FALSE     coob = TRUE, control = rpart.control(minsplit = 2, cp = 0))
FALSE 
FALSE Out-of-bag estimate of misclassification error:  0.0442
# Probability of the second class ('1') for every test row (column 10 = churn
# is dropped so only predictors are passed in)
prob.bag <- predict(ames_bag1, dftest[, -10], type = 'prob')[, 2]
# Threshold at 0.5; explicit levels keep both classes in the factor even if
# only one of them is ever predicted
predicted <- factor(ifelse(prob.bag >= 0.5, 1, 0), levels = c(0, 1))

CM_Function(confusionMatrix(data = predicted, reference = dftest$churn, positive = '1'))

# ROC / AUC from the predicted probabilities; hard 0/1 labels would reduce the
# curve to a single operating point
roc_score <- roc(response = dftest$churn, predictor = prob.bag) #AUC score
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
plot(roc_score ,main ="ROC curve")

knn

set.seed(1)

# Accuracy and sensitivity for k = 1..30. The results live in their own data
# frame so the data set `df` is no longer overwritten by this table.
k_grid <- seq_len(30)
knn_results <- data.frame(k = k_grid, accuracy = 0, sensitivity = 0)

# iterating over different ks
# NOTE(review): k is compared on the test set here, so the metrics reported
# for the chosen k are optimistic; a validation split or cross-validation on
# the training data would avoid that.
for (i in seq_along(k_grid)) {
  # nearest neighbor (column 10 is churn, so it is dropped from the predictors)
  knn_fit <- knn3(y = dftrain$churn, x = dftrain[, -10], k = k_grid[i])

  # predictions response
  knn_class <- predict(knn_fit, dftest[, -10], type = "class")

  # Confusion matrix, computed once and read twice
  knn_cm <- confusionMatrix(knn_class, dftest$churn, positive = "1")
  knn_results$sensitivity[i] <- knn_cm$byClass[1]
  knn_results$accuracy[i] <- knn_cm$overall[1]
}

# plot the k's
ggplot(knn_results, aes(x=k)) + 
  geom_line(aes(y = sensitivity, colour = "Sensitivity")) + 
  geom_line(aes(y = accuracy, colour = "Accuracy")) + 
  labs(x = "Number of k nearest neighbours", 
       y = "Accuracy / Sensitivity", title = "Accuracy / Sensitivity regarding k") +
  theme_minimal() + 
  scale_y_continuous(name = "Sensitivity / Accuracy", limits = c(0.7, 1)) +
    scale_color_manual(name = "Values", values = c("Sensitivity" = "darkblue", "Accuracy" = "red")) + 
  xlim (1, 30)

mod.knn <- knn3(y = dftrain$churn, x = dftrain[, -10], k = 2)

predicted.knn <- predict(mod.knn, dftest[, -10], type = "class")
# Share of neighbours voting for the second class ('1'), used for the ROC curve
prob.knn <- predict(mod.knn, dftest[, -10], type = "prob")[, 2]

confmat.knn <- confusionMatrix(data = predicted.knn, reference = dftest$churn, positive = '1')

CM_Function(confmat.knn)

# ROC from the class probabilities rather than from the hard labels
roc_score.knn <- roc(response = dftest$churn, predictor = prob.knn)
# The plot call that follows this chunk still refers to `roc_score.qda`
# (a copy-paste leftover), so keep that name pointing at the kNN curve.
roc_score.qda <- roc_score.knn
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
plot(roc_score.qda ,main ="ROC curve")

QDA

# Quadratic discriminant analysis on the training data
mod.qda <- qda(churn ~ ., data = dftrain)

# Predict once (column 10 = churn is dropped): $class gives the labels,
# $posterior the class probabilities
pred.qda <- predict(mod.qda, dftest[, -10])
predicted.qda <- pred.qda$class

confmat.qda <- confusionMatrix(data = predicted.qda, reference = dftest$churn, positive = '1')

CM_Function(confmat.qda)

# ROC from the posterior probability of the second class ('1'); hard labels
# would give a one-point curve
roc_score.qda <- roc(response = dftest$churn, predictor = pred.qda$posterior[, 2])
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
plot(roc_score.qda ,main ="ROC curve")

QLOG

# Full probit model on all predictors, used as the starting point for
# AIC-based stepwise selection
mod.log <- stats::glm(
  churn ~ .,
  data = dftrain,
  family = binomial(link = "probit")
)

# Stepwise search; prints one AIC table per step
s <- stats::step(mod.log)
FALSE Start:  AIC=6873.98
FALSE churn ~ account_length + international_plan + voice_mail_plan + 
FALSE     number_vmail_messages + total_day_calls + total_eve_calls + 
FALSE     total_night_calls + total_intl_calls + number_customer_service_calls + 
FALSE     total_day_charge_per_minute + total_eve_charge_per_minute + 
FALSE     total_night_charge_per_minute + total_intl_charge_per_minute + 
FALSE     account_length_sqd + number_vmail_messages_sqd + total_day_calls_sqd + 
FALSE     total_eve_calls_sqd + total_night_calls_sqd + total_intl_calls_sqd + 
FALSE     number_customer_service_calls_sqd + total_day_charge_per_minute_sqd + 
FALSE     total_eve_charge_per_minute_sqd + total_night_charge_per_minute_sqd + 
FALSE     total_intl_charge_per_minute_sqd
FALSE 
FALSE                                     Df Deviance    AIC
FALSE - account_length_sqd                 1   6824.0 6872.0
FALSE - total_day_charge_per_minute        1   6824.0 6872.0
FALSE - total_day_charge_per_minute_sqd    1   6824.0 6872.0
FALSE - account_length                     1   6824.2 6872.2
FALSE - total_night_calls_sqd              1   6824.4 6872.4
FALSE - total_night_calls                  1   6824.7 6872.7
FALSE - total_intl_charge_per_minute       1   6825.4 6873.4
FALSE - total_intl_charge_per_minute_sqd   1   6825.7 6873.7
FALSE <none>                                   6824.0 6874.0
FALSE - total_eve_calls                    1   6826.6 6874.6
FALSE - total_eve_calls_sqd                1   6826.9 6874.9
FALSE - total_night_charge_per_minute_sqd  1   6827.4 6875.4
FALSE - total_day_calls                    1   6827.5 6875.5
FALSE - total_night_charge_per_minute      1   6827.5 6875.5
FALSE - total_day_calls_sqd                1   6829.3 6877.3
FALSE - number_vmail_messages_sqd          1   6830.4 6878.4
FALSE - number_vmail_messages              1   6834.5 6882.5
FALSE - number_customer_service_calls      1   6848.8 6896.8
FALSE - total_intl_calls_sqd               1   6849.2 6897.2
FALSE - voice_mail_plan                    1   6854.4 6902.4
FALSE - total_eve_charge_per_minute        1   6855.4 6903.4
FALSE - total_eve_charge_per_minute_sqd    1   6855.4 6903.4
FALSE - total_intl_calls                   1   6855.7 6903.7
FALSE - number_customer_service_calls_sqd  1   6993.9 7041.9
FALSE - international_plan                 1   7418.9 7466.9
FALSE 
FALSE Step:  AIC=6871.99
FALSE churn ~ account_length + international_plan + voice_mail_plan + 
FALSE     number_vmail_messages + total_day_calls + total_eve_calls + 
FALSE     total_night_calls + total_intl_calls + number_customer_service_calls + 
FALSE     total_day_charge_per_minute + total_eve_charge_per_minute + 
FALSE     total_night_charge_per_minute + total_intl_charge_per_minute + 
FALSE     number_vmail_messages_sqd + total_day_calls_sqd + total_eve_calls_sqd + 
FALSE     total_night_calls_sqd + total_intl_calls_sqd + number_customer_service_calls_sqd + 
FALSE     total_day_charge_per_minute_sqd + total_eve_charge_per_minute_sqd + 
FALSE     total_night_charge_per_minute_sqd + total_intl_charge_per_minute_sqd
FALSE 
FALSE                                     Df Deviance    AIC
FALSE - total_day_charge_per_minute        1   6824.0 6870.0
FALSE - total_day_charge_per_minute_sqd    1   6824.0 6870.0
FALSE - total_night_calls_sqd              1   6824.4 6870.4
FALSE - total_night_calls                  1   6824.7 6870.7
FALSE - total_intl_charge_per_minute       1   6825.5 6871.5
FALSE - total_intl_charge_per_minute_sqd   1   6825.7 6871.7
FALSE <none>                                   6824.0 6872.0
FALSE - total_eve_calls                    1   6826.6 6872.6
FALSE - total_eve_calls_sqd                1   6826.9 6872.9
FALSE - total_night_charge_per_minute_sqd  1   6827.5 6873.5
FALSE - total_day_calls                    1   6827.5 6873.5
FALSE - total_night_charge_per_minute      1   6827.6 6873.6
FALSE - account_length                     1   6827.9 6873.9
FALSE - total_day_calls_sqd                1   6829.3 6875.3
FALSE - number_vmail_messages_sqd          1   6830.4 6876.4
FALSE - number_vmail_messages              1   6834.5 6880.5
FALSE - number_customer_service_calls      1   6848.8 6894.8
FALSE - total_intl_calls_sqd               1   6849.2 6895.2
FALSE - voice_mail_plan                    1   6854.4 6900.4
FALSE - total_eve_charge_per_minute        1   6855.4 6901.4
FALSE - total_eve_charge_per_minute_sqd    1   6855.4 6901.4
FALSE - total_intl_calls                   1   6855.7 6901.7
FALSE - number_customer_service_calls_sqd  1   6994.0 7040.0
FALSE - international_plan                 1   7419.0 7465.0
FALSE 
FALSE Step:  AIC=6870.01
FALSE churn ~ account_length + international_plan + voice_mail_plan + 
FALSE     number_vmail_messages + total_day_calls + total_eve_calls + 
FALSE     total_night_calls + total_intl_calls + number_customer_service_calls + 
FALSE     total_eve_charge_per_minute + total_night_charge_per_minute + 
FALSE     total_intl_charge_per_minute + number_vmail_messages_sqd + 
FALSE     total_day_calls_sqd + total_eve_calls_sqd + total_night_calls_sqd + 
FALSE     total_intl_calls_sqd + number_customer_service_calls_sqd + 
FALSE     total_day_charge_per_minute_sqd + total_eve_charge_per_minute_sqd + 
FALSE     total_night_charge_per_minute_sqd + total_intl_charge_per_minute_sqd
FALSE 
FALSE                                     Df Deviance    AIC
FALSE - total_night_calls_sqd              1   6824.5 6868.5
FALSE - total_night_calls                  1   6824.7 6868.7
FALSE - total_intl_charge_per_minute       1   6825.5 6869.5
FALSE - total_intl_charge_per_minute_sqd   1   6825.7 6869.7
FALSE <none>                                   6824.0 6870.0
FALSE - total_eve_calls                    1   6826.7 6870.7
FALSE - total_eve_calls_sqd                1   6826.9 6870.9
FALSE - total_night_charge_per_minute_sqd  1   6827.5 6871.5
FALSE - total_day_calls                    1   6827.5 6871.5
FALSE - total_night_charge_per_minute      1   6827.6 6871.6
FALSE - account_length                     1   6828.0 6872.0
FALSE - total_day_calls_sqd                1   6829.3 6873.3
FALSE - number_vmail_messages_sqd          1   6830.5 6874.5
FALSE - number_vmail_messages              1   6834.6 6878.6
FALSE - total_day_charge_per_minute_sqd    1   6836.5 6880.5
FALSE - number_customer_service_calls      1   6848.9 6892.9
FALSE - total_intl_calls_sqd               1   6849.3 6893.3
FALSE - voice_mail_plan                    1   6854.4 6898.4
FALSE - total_eve_charge_per_minute        1   6855.4 6899.4
FALSE - total_eve_charge_per_minute_sqd    1   6855.4 6899.4
FALSE - total_intl_calls                   1   6855.7 6899.7
FALSE - number_customer_service_calls_sqd  1   6994.0 7038.0
FALSE - international_plan                 1   7419.0 7463.0
FALSE 
FALSE Step:  AIC=6868.46
FALSE churn ~ account_length + international_plan + voice_mail_plan + 
FALSE     number_vmail_messages + total_day_calls + total_eve_calls + 
FALSE     total_night_calls + total_intl_calls + number_customer_service_calls + 
FALSE     total_eve_charge_per_minute + total_night_charge_per_minute + 
FALSE     total_intl_charge_per_minute + number_vmail_messages_sqd + 
FALSE     total_day_calls_sqd + total_eve_calls_sqd + total_intl_calls_sqd + 
FALSE     number_customer_service_calls_sqd + total_day_charge_per_minute_sqd + 
FALSE     total_eve_charge_per_minute_sqd + total_night_charge_per_minute_sqd + 
FALSE     total_intl_charge_per_minute_sqd
FALSE 
FALSE                                     Df Deviance    AIC
FALSE - total_night_calls                  1   6825.9 6867.9
FALSE - total_intl_charge_per_minute       1   6826.0 6868.0
FALSE - total_intl_charge_per_minute_sqd   1   6826.2 6868.2
FALSE <none>                                   6824.5 6868.5
FALSE - total_eve_calls                    1   6827.1 6869.1
FALSE - total_eve_calls_sqd                1   6827.4 6869.4
FALSE - total_night_charge_per_minute_sqd  1   6827.9 6869.9
FALSE - total_day_calls                    1   6828.1 6870.1
FALSE - total_night_charge_per_minute      1   6828.1 6870.1
FALSE - account_length                     1   6828.5 6870.5
FALSE - total_day_calls_sqd                1   6829.9 6871.9
FALSE - number_vmail_messages_sqd          1   6830.9 6872.9
FALSE - number_vmail_messages              1   6835.0 6877.0
FALSE - total_day_charge_per_minute_sqd    1   6836.8 6878.8
FALSE - number_customer_service_calls      1   6849.3 6891.3
FALSE - total_intl_calls_sqd               1   6849.6 6891.6
FALSE - voice_mail_plan                    1   6854.8 6896.8
FALSE - total_eve_charge_per_minute        1   6856.1 6898.1
FALSE - total_eve_charge_per_minute_sqd    1   6856.1 6898.1
FALSE - total_intl_calls                   1   6856.2 6898.2
FALSE - number_customer_service_calls_sqd  1   6994.3 7036.3
FALSE - international_plan                 1   7419.8 7461.8
FALSE 
FALSE Step:  AIC=6867.93
FALSE churn ~ account_length + international_plan + voice_mail_plan + 
FALSE     number_vmail_messages + total_day_calls + total_eve_calls + 
FALSE     total_intl_calls + number_customer_service_calls + total_eve_charge_per_minute + 
FALSE     total_night_charge_per_minute + total_intl_charge_per_minute + 
FALSE     number_vmail_messages_sqd + total_day_calls_sqd + total_eve_calls_sqd + 
FALSE     total_intl_calls_sqd + number_customer_service_calls_sqd + 
FALSE     total_day_charge_per_minute_sqd + total_eve_charge_per_minute_sqd + 
FALSE     total_night_charge_per_minute_sqd + total_intl_charge_per_minute_sqd
FALSE 
FALSE                                     Df Deviance    AIC
FALSE - total_intl_charge_per_minute       1   6827.4 6867.4
FALSE - total_intl_charge_per_minute_sqd   1   6827.7 6867.7
FALSE <none>                                   6825.9 6867.9
FALSE - total_eve_calls                    1   6828.6 6868.6
FALSE - total_eve_calls_sqd                1   6828.9 6868.9
FALSE - total_night_charge_per_minute_sqd  1   6829.4 6869.4
FALSE - total_day_calls                    1   6829.4 6869.4
FALSE - total_night_charge_per_minute      1   6829.6 6869.6
FALSE - account_length                     1   6829.8 6869.8
FALSE - total_day_calls_sqd                1   6831.2 6871.2
FALSE - number_vmail_messages_sqd          1   6832.4 6872.4
FALSE - number_vmail_messages              1   6836.5 6876.5
FALSE - total_day_charge_per_minute_sqd    1   6838.2 6878.2
FALSE - number_customer_service_calls      1   6850.3 6890.3
FALSE - total_intl_calls_sqd               1   6850.9 6890.9
FALSE - voice_mail_plan                    1   6856.3 6896.3
FALSE - total_eve_charge_per_minute        1   6857.4 6897.4
FALSE - total_eve_charge_per_minute_sqd    1   6857.4 6897.4
FALSE - total_intl_calls                   1   6857.4 6897.4
FALSE - number_customer_service_calls_sqd  1   6995.0 7035.0
FALSE - international_plan                 1   7421.8 7461.8
FALSE 
FALSE Step:  AIC=6867.44
FALSE churn ~ account_length + international_plan + voice_mail_plan + 
FALSE     number_vmail_messages + total_day_calls + total_eve_calls + 
FALSE     total_intl_calls + number_customer_service_calls + total_eve_charge_per_minute + 
FALSE     total_night_charge_per_minute + number_vmail_messages_sqd + 
FALSE     total_day_calls_sqd + total_eve_calls_sqd + total_intl_calls_sqd + 
FALSE     number_customer_service_calls_sqd + total_day_charge_per_minute_sqd + 
FALSE     total_eve_charge_per_minute_sqd + total_night_charge_per_minute_sqd + 
FALSE     total_intl_charge_per_minute_sqd
FALSE 
FALSE                                     Df Deviance    AIC
FALSE <none>                                   6827.4 6867.4
FALSE - total_eve_calls                    1   6830.1 6868.1
FALSE - total_eve_calls_sqd                1   6830.4 6868.4
FALSE - total_night_charge_per_minute_sqd  1   6830.8 6868.8
FALSE - total_day_calls                    1   6830.9 6868.9
FALSE - total_night_charge_per_minute      1   6830.9 6868.9
FALSE - account_length                     1   6831.3 6869.3
FALSE - total_day_calls_sqd                1   6832.7 6870.7
FALSE - number_vmail_messages_sqd          1   6833.8 6871.8
FALSE - number_vmail_messages              1   6837.9 6875.9
FALSE - total_day_charge_per_minute_sqd    1   6839.9 6877.9
FALSE - total_intl_charge_per_minute_sqd   1   6843.0 6881.0
FALSE - total_intl_calls_sqd               1   6852.1 6890.1
FALSE - number_customer_service_calls      1   6852.2 6890.2
FALSE - voice_mail_plan                    1   6857.6 6895.6
FALSE - total_intl_calls                   1   6858.5 6896.5
FALSE - total_eve_charge_per_minute        1   6858.6 6896.6
FALSE - total_eve_charge_per_minute_sqd    1   6858.6 6896.6
FALSE - number_customer_service_calls_sqd  1   6996.7 7034.7
FALSE - international_plan                 1   7421.9 7459.9
# Refit the probit model with the formula selected by step()
mod.log <- glm(s$formula, data = dftrain, family = binomial(link = "probit"))

# Fitted probability of the second level of churn ('1') on the test set
# (column 10 = churn is dropped)
prob.log <- predict(mod.log, dftest[, -10], type = 'response')
# Threshold at 0.5; explicit levels keep both classes in the factor even if
# only one of them is ever predicted
predicted.log <- factor(ifelse(prob.log > 0.5, 1, 0), levels = c(0, 1))

confmat.log <- confusionMatrix(data = predicted.log, reference = dftest$churn, positive = '1')

CM_Function(confmat.log)

# ROC from the fitted probabilities rather than from the thresholded labels
roc_score.log <- roc(response = dftest$churn, predictor = as.numeric(prob.log))
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
plot(roc_score.log ,main ="ROC curve")

Gaussian SVM

library(e1071)
mod.svm <- svm(formula = churn ~ .,
               data = dftrain,
               type = 'C-classification', # churn is a factor, so fit a classifier
               kernel = 'radial')

# Ask for the decision values together with the labels; they are the
# continuous score used for the ROC curve (column 10 = churn is dropped).
pred.svm <- predict(mod.svm, dftest[, -10], decision.values = TRUE)
score.svm <- as.numeric(attr(pred.svm, "decision.values")[, 1])

# Plain factor of predicted labels, without the decision-value attribute
predicted.svm <- pred.svm
attr(predicted.svm, "decision.values") <- NULL

confmat.svm <- confusionMatrix(data = predicted.svm, reference = dftest$churn, positive = '1')

CM_Function(confmat.svm)

# ROC from the decision values rather than from hard labels. The sign of the
# decision value depends on the class order inside the fitted model, so the
# direction is left for roc() to detect automatically.
roc_score.svm <- roc(response = dftest$churn, predictor = score.svm)
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
plot(roc_score.svm ,main ="ROC curve")

Boosting

# library(gbm)
# 
# mod <-  gbm(churn ~.,
#                 data = dftrain,
#                 distribution = "gaussian",
#                 cv.folds = 10,
#                 shrinkage = .01,
#                 n.minobsinnode = 10,
#                 n.trees = 500)
# 
# predicted <- factor(ifelse(1/(1+exp(-2*predict.gbm(mod, dftest[,-c(10)])))>=0.5,1,0))
# 
# 
# dftrain <-  dftrain |>
#   mutate_if(is.factor, as.character) |>
#   mutate_if(is.character, as.numeric)
# 
# dftest <-  dftest |>
#   mutate_if(is.factor, as.character) |>
#   mutate_if(is.character, as.numeric)
# 
# xgb_train <- xgb.DMatrix(data = as.matrix(dftrain[,-c(10)]), label = dftrain$churn)
# xgb_test <- xgb.DMatrix(data = as.matrix(dftest[,-c(10)]), label = dftest$churn)
# xgb_params <- list(
#   booster = "gbtree",
#   eta = 0.01,
#   max_depth = 8,
#   gamma = 4,
#   subsample = 0.75,
#   colsample_bytree = 1,
#   objective = "multi:softprob",
#   eval_metric = "mlogloss",
#   num_class = 2)
# 
# xgb_model <- xgb.train(
#   params = xgb_params,
#   data = xgb_train,
#   nrounds = 100,
#   verbose = 1
# )
# 
# xgb_model
# 
# xgb_preds <- predict(xgb_model, as.matrix(dftest$churn), reshape = TRUE)
# xgb_preds <- as.data.frame(xgb_preds)
# colnames(xgb_preds) <- c(0,1)
# predicted <- ifelse(xgb_preds[,2] > 0.5, 1, 0)
# 
# CM_Function(confusionMatrix(data=predicted, reference = dftest$churn, positive = "1"))
# 
# library(pROC)
# roc_score=roc(factor(dftest$churn, ordered=TRUE), factor(predicted, ordered=TRUE)) #AUC score
# plot(roc_score ,main ="ROC curve")

Logistic Regression

#mod <- glm(churn ~., data = df_UNDER, family = binomial(link='logit'))
#summary(mod)

#churn_predicted <- factor(ifelse(predict(mod, df[,-c(10)], type = 'response') < 0.5, 'no', 'yes'))

#library(caret)

#LR_Confusion_Matrix_training <- confusionMatrix(data=churn_predicted, reference = df$churn, positive = 'yes')

#CM_Function(LR_Confusion_Matrix_training)

Logistic regression doesn’t work well here. Changing the decision boundary also doesn’t help, so we may actually need to resample the data.

https://www.r-bloggers.com/2021/05/class-imbalance-handling-imbalanced-data-in-r/ https://www.analyticsvidhya.com/blog/2016/03/practical-guide-deal-imbalanced-classification-problems/ https://datascientistdiary.com/index.php/2021/09/02/how-to-handle-imbalanced-data-example-in-r/ https://cran.r-project.org/web/packages/imbalance/vignettes/imbalance.pdf

LDA

# LDA Model Fit on Training
#LDA_training <- lda(churn~., df_UNDER)
#LDA_training

# Density Plot for Overlapping 
#p <- predict(LDA_training, df)
#ldahist(data = p$x[,1], g = df$churn, col = "#0D8387")

Confusion Matrix (Training VS Test)

# Training 60%
#LDA_predictions_training <- predict(LDA_training, df)$class

#LDA_Confusion_Matrix_training <- confusionMatrix(data = LDA_predictions_training, reference = df$churn, positive='yes')

#CM_Function(LDA_Confusion_Matrix_training)

Random Forest V2

# ROSE DATA

#rf <- randomForest(churn~., data=df_UNDER, proximity=TRUE)


#Predictions_OVER <- predict(rf, df_UNDER[,-10])
#CM_OVER <- confusionMatrix(Predictions_OVER, df_UNDER$churn, positive = 'yes')

#CM_Function(CM_OVER)

# ORIGINAL DATA

#Predictions_DATA <- predict(rf, df[,-10])

#CM_DATA <- confusionMatrix(Predictions_DATA, df$churn, positive = 'yes')

#CM_Function(CM_DATA)

Decision Trees

#tree <- rpart(churn ~., data = df)

#rpart.plot(tree)

#printcp(tree)
#plotcp(tree)

#Predictions_DT <- predict(tree, df[,-10])[,2]

#Predictions_DT <- ifelse(Predictions_DT > 0.5, "yes","no")

#Predictions_DT <- as.factor(Predictions_DT)

#CM_DATA <- confusionMatrix(Predictions_DT, df$churn, positive = 'yes')

#CM_Function(CM_DATA)

Neural Net

#library(caret)

#nn1 <- train(churn ~ ., data = df_UNDER, method = "nnet")

#nn1.pre <- predict(nn1, df[,-c(10)])

#confusionMatrix(nn1.pre, df$churn, positive = 'yes')
#CM_Function(confusionMatrix(nn1.pre, df$churn, positive = 'yes'))

Support Vector Machines

#library(e1071)

#classifierR = svm(formula = churn ~ .,
                 #data = dftrain,
                 #type = 'C-classification', # C-classification because churn is a categorical target (classification, not regression)
                 #kernel = 'radial', 
                 #cost = 100,
                 #gamma = 20)


#svm_1 <- predict(object = classifierR, newdata = dftest[,-c(10)])

#confusionMatrix(svm_1, dftest$churn, positive = '1')

#CM_Function(confusionMatrix(svm_1, df$churn, positive = '1'))